****************************************
****************************************
* DATA CLEANING AND PANEL CONSTRUCTIONS
****************************************
****************************************

* IMPORTANT: make sure to run start.do before to set up the directories.  


/*------------------------- 
	Organization

		SECTION 1.  Clean Compustat Fundamentals Annual	
		
		SECTION 2. Estimate markups

		SECTION 3. Create variables of interest
		
-------------------------*/ 



clear
set more off
set max_memory 250g, perm
set seed 123456789

/**********************************************************************/
/*  SECTION 1. Clean Compustat Fundamentals Annual			
    Notes: 

    1.1 industry codes
    1.2 filters
    1.3 Interpolation
    1.4 merge auxiliary variables
    1.5 clean variables
    1.6 industry vars 

*/
/**********************************************************************/

use "${data}/Compustat_Raw.dta" if fic == "USA", clear
rename sale sales
destring gvkey, replace
rename fyear year
clonevar timevar = year
xtset gvkey timevar
gsort gvkey timevar


 
/*----------------------------------------------------*/
   /* [>   1.1 industry codes   <] */ 
/*----------------------------------------------------*/

gsort gvkey -year
by gvkey: replace sich = sich[_n-1] if mi(sich) 
destring sic, replace
replace sic = sich if !mi(sich)
generate sic2 = floor(sic/100)

by gvkey: replace naicsh = naicsh[_n-1] if mi(naicsh)
destring naics, replace
destring naicsh, replace
replace naics = naicsh if !mi(naicsh)


/*----------------------------------------------------*/
   /* [>   1.2 filters   <] */ 
/*----------------------------------------------------*/
drop if (mi(sic) | inrange(sic, 4900, 4999) | inrange(sic,6000,6999))
drop if mi(sales, cogs, xopr, sic2, ppegt, ppent, at)
keep if (at > 0)
keep if (sales > 0)
keep if (cogs > 0)
keep if (xopr > 0)
keep if (ppegt > 0)
keep if (aqc < 0.05*at | mi(aqc))

count if mi(naics)
count if mi(xsga)
count if mi(cusip)
count if mi(tic)

gen sga = xopr-cogs
gen cog2sal = cogs/sales
gen sga2sal= sga/sales

bys year: egen pc1=pctile(cog2sal), p(1)
bys year: egen pc99=pctile(cog2sal), p(99)

bys year: egen ps1=pctile(sga2sal), p(1)
bys year: egen ps99=pctile(sga2sal), p(99)

bys year: keep if inrange(cog2sal,pc1,pc99)
bys year: keep if inrange(sga2sal,ps1,ps99)


/*----------------------------------------------------*/
   /* [>   1.3 interpolation   <] */ 
/*----------------------------------------------------*/

* how many years in a row do we interpolate
count if mi(sales, cogs, ppent, ppegt, xopr)
gsort gvkey timevar
by gvkey (timevar): gen gap = timevar[_n] - timevar[_n-1] 
replace gap = gap - 1
tab gap

* fill gaps
count if mi(sales, cogs, sic2, xopr, ppegt, ppent, at)
generate tag = 0 /* tag observations originally in the data */
tsfill
count if mi(tag)
gsort gvkey -timevar
by gvkey: replace gap = gap[_n-1] if mi(gap) & _n != _N

* calculate gap of interpolated obs
gsort gvkey timevar
gen gaptoobig = gap > 1 & !mi(gap)
replace gaptoobig = 0 if !mi(tag) /* the observation was in the original sample */

drop if gaptoobig  /*  don't interpolate if the gap is longer than 1 year */

gen interpolatedcogs = mi(cogs)
gen interpolatedopex = mi(xopr)

drop gap tag gaptoobig

gsort gvkey timevar
foreach x of varlist sales cogs ppent ppegt xopr {
	by gvkey (timevar): ipolate `x' timevar, generate(`x'2)
	drop `x'
	rename `x'2 `x'
}

foreach x of varlist sic2 conm sic naics cusip tic {
	by gvkey (timevar): replace `x' = `x'[_n-1] if mi(`x') & !mi(`x'[_n-1])
}

count if mi(naics)
count if mi(sic)
count if mi(cusip)
count if mi(tic)
count if mi(sales, cogs, sic2, xopr, ppegt, ppent)

replace year = timevar if mi(year) /* because of the interpolation */

/*----------------------------------------------------*/
   /* [>   1.4 merge auxiliary variables <] */ 
/*----------------------------------------------------*/

merge m:1 year using "${data}/Nipa.dta", nogen assert(2 3) keep(3)
merge m:1 year using "${data}/INVDEF.dta", nogen assert(2 3) keep(3)
merge m:1 year using "${data}/FEDFUNDS.dta", nogen keep(1 3)
merge m:1 year using "${data}/Inflation.dta", nogen keep(1 3)
gsort gvkey timevar


/*------------------------------------ End of SECTION 1 ------------------------------------*/

/**********************************************************************/
/*  SECTION 2. Clean variables of interest
    Notes: */
/**********************************************************************/

gsort gvkey timevar
xtset gvkey timevar

* edit aux variables
replace fedfunds = fedfunds/100
replace inflation = inflation/100

* save the nominal sales data
clonevar sales_nominal = sales

* deflate variables
foreach var of varlist sales cogs xopr xad xsga xrent xlr xrd fca recd txs at aqc dlc dltt mkvalt prcc_f gp {
	replace `var' = 100*`var'/def_gdp
}

* capital
generate capital = ppegt * 100/invdef

* perpetual inventory
by gvkey (timevar): gen capital_pi = ppegt if _n == 1
gen net_inv = 100*D.ppent/def_inv
replace capital_pi = L.capital_pi + net_inv if !mi(L.capital_pi) & !mi(net_inv)

* generate other variables of interest
gen double r = (fedfunds - inflation) + 0.12
gen totalcosts = xopr + (capital * r)
gen totalcosts_pi = xopr + (capital_pi * r)
gen cogs_totalcosts = cogs/totalcosts
gen cogs_totalcosts_pi = cogs/totalcosts_pi
gen logcapital = log(capital)
gen logcapital_pi = log(capital_pi)
gen mktcap = prcc_f * csho

* cusip
gen temp = substr(cusip, 1, 6)
drop cusip
rename temp cusip

* total debt
egen totaldebt = rowtotal(dlc dltt)

* ratios
gen double cogs_opex = (cogs/xopr)
gen debt_asset = totaldebt/at

* logs
gen logsales = log(sales)

* leads, lags and deltas
gen leadsales = F.sales
gen pctchangesales = (leadsales-sales)/sales * 100
gen logsales_f1 = f.logsales
gen logsales_l1 = L1.logsales
gen logsales_l2 = L2.logsales
gen logsales_l3 = L3.logsales
gen logsales_l4 = L4.logsales
gen logsales_l5 = L5.logsales
gen delta_logsales_f1 = (logsales_f1-logsales)
gen debt_asset_l1 = L.debt_asset

* percentiles
egen p30 = pctile(debt_asset_l1), p(30)
egen p70 = pctile(debt_asset_l1), p(70)

* Generate SIC 1-digit industry code:
cap gen sic1 = .
replace sic1 = 1 if sic2 <= 9
replace sic1 = 2 if sic2 >= 10 & sic2 <= 14
replace sic1 = 3 if sic2 >= 15 & sic2 <= 17
replace sic1 = 4 if sic2 >= 20 & sic2 <= 39
replace sic1 = 5 if sic2 >= 40 & sic2 <= 49
replace sic1 = 6 if sic2 >= 50 & sic2 <= 51
replace sic1 = 7 if sic2 >= 52 & sic2 <= 59
replace sic1 = 8 if sic2 >= 60 & sic2 <= 67
replace sic1 = 9 if sic2 >= 70 & sic2 <= 89
replace sic1 = 10 if sic2 >= 91 & sic2 <= 98
replace sic1 = 11 if sic2 == 99

label define sic1l 1 "Agriculture, Forestry, & Fishing" 2 "Mining" 3 "Construction" 4 "Manufacturing" 5 "Transportation & Public Utilities" /*
 */ 6 "Wholesale Trade" 7 "Retail Trade" 8 "Finance, Insurance, & Real Estate" 9 "Services" 10 "Public Administration" 11 "Nonclassifiable Establishments"
label values sic1 sic1l

* relative sales 
gsort sic1 timevar
by sic1 timevar: egen totalyearindus1sales = total(sales)
gen mktshare_year_sic1 = sales/totalyearindus1sales
gsort gvkey timevar
gen mktshare_year_sic1_l1 = L.mktshare_year_sic1
egen p30_rel = pctile(mktshare_year_sic1_l1), p(30)
egen p70_rel = pctile(mktshare_year_sic1_l1), p(70)

* Label SIC 2-Digit industry code:
label define sic2l 01 "Agricultural Production – Crops" 02 "Agricultural Production – Livestock" 07 "Agricultural Services" 08 "Forestry" ///  
	09 "Fishing, Hunting, & Trapping" 10 "Metal, Mining" 12 "Coal Mining" 13 "Oil & Gas Extraction" 14 "Nonmetallic Minerals, Except Fuels" ///
	15 "General Building Contractors" 16 "Heavy Construction, Except Building" 17 "Special Trade Contractors" 20 "Food & Kindred Products" ///
	21 "Tobacco Products" 22 "Textile Mill Products" 23 "Apparel & Other Textile Products" 24 "Lumber & Wood Products" 25 "Furniture & Fixtures" ///
	26 "Paper & Allied Products" 27 "Printing & Publishing" 28 "Chemical & Allied Products" 29 "Petroleum & Coal Products" 30 "Rubber & Miscellaneous Plastics Products" ///
	31 "Leather & Leather Products" 32 "Stone, Clay, & Glass Products" 33 "Primary Metal Industries" 34 "Fabricated Metal Products" 35 "Industrial Machinery & Equipment" ///
	36 "Electronic & Other Electric Equipment" 37 "Transportation Equipment" 38 "Instruments & Related Products" 39 "Miscellaneous Manufacturing Industries" ///
	40 "Railroad Transportation" 41 "Local & Interurban Passenger Transit" 42 "Trucking & Warehousing" 43 "U.S. Postal Service" 44 "Water Transportation" ///
	45 "Transportation by Air" 46 "Pipelines, Except Natural Gas" 47 "Transportation Services" 48 "Communications" 50 "Wholesale Trade – Durable Goods" ///
	51 "Wholesale Trade – Nondurable Goods" 52 "Building Materials & Gardening Supplies" 53 "General Merchandise Stores" 54 "Food Stores" ///
	55 "Automative Dealers & Service Stations" 56 "Apparel & Accessory Stores" 57 "Furniture & Homefurnishings Stores" 58 "Eating & Drinking Places" ///
	59 "Miscellaneous Retail" 70 "Hotels & Other Lodging Places" 72 "Personal Services" 73 "Business Services" 75 "Auto Repair, Services, & Parking" ///
	76 "Miscellaneous Repair Services" 78 "Motion Pictures" 79 "Amusement & Recreation Services" 80 "Health Services" 81 "Legal Services" ///
	82 "Educational Services" 83 "Social Services" 84 "Museums, Botanical, Zoological Gardens" 86 "Membership Organizations" 87 "Engineering & Management Services" ///
	88 "Private Households" 89 "Services, Not Elsewhere Classified" 91 "Executive, Legislative, & General" 92 "Justice, Public Order, & Safety" ///
	93 "Finance, Taxation, & Monetary Policy" 94 "Administration of Human Resources" 95 "Environmental Quality & Housing" 96 "Administration of Economic Programs" ///
	97 "National Security & International Affairs" 98 "Zoological Gardens" 99 "Non-Classifiable Establishments"

label values sic2 sic2l

* create NAICS 2-Digit industry code:
tostring naics, replace
replace naics = "" if naics == "."
gen naics_2 = substr(naics, 1, 2)
replace naics_2 = "" if strlen(naics_2) != 2

replace naics_2 = "31" if naics_2 == "32" | naics_2 == "33"
replace naics_2 = "44" if naics_2 == "45"
replace naics_2 = "48" if naics_2 == "49"

* label variables
label variable sales_nominal "Nominal Sales"
label variable sales "Deflated Sales"
label variable xopr "Deflated OPEX"
label variable cogs "Deflated COGS"
label variable ppegt "Gross PPE"
label variable ppent "Net PPE"
label variable def_gdp "GDP Deflator"
label variable emp "Employment (1000s)"
label variable sic "2-Digit SIC Industry"
label variable conm "Company name"
label variable at "Assets deflated"
label variable fca "Foreign Exchange Income (Loss) Deflated"
label variable recd "Receivables Deflated - Estimated Doubtful"
label variable txs "Income Taxes State Deflated"
label variable xad "Advertising Deflated"
label variable xlr "Staff Expense Deflated - Total"
label variable xrd "Research and Development Deflated"
label variable xrent "Rent Expense Deflated"
label variable xsga "Selling, General and Administrative Expense Deflated"
label variable dldte "Effective date for deletion"
label variable dlrsn "Reason for deletion"
label variable costat "Active/Inactive"
label variable cusip "CUSIP firm code"
label variable tic "Ticker symbol code"
label variable prcc_f "Price Close - Annual - Fiscal"
label variable mkvalt "Market value deflated - fiscal - total - in millions"
label variable csho "Common Shares Outstanding - in millions"
label variable mktcap "Market value estimated (prcc_f * csho) - in millions"
label variable capital "Deflate PPEGT"
label variable capital_pi "Capital using perpetual inventory method"
label variable logcapital "log(Capital)"
label variable logcapital_pi "log(Capital from perpetual inventory method)"
label variable totalcosts "opex + (capital * r)"
label variable totalcosts_pi "opex + (capital from perpetual inventory method * r)"
label variable cogs_totalcosts "COGS/totalcosts"
label variable cogs_totalcosts_pi "COGS/totalcosts perpetual inventory method"
label variable gp "Gross profit"

/*------------------------------------ End of SECTION 2 ------------------------------------*/

/**********************************************************************/
/*  SECTION 3. Estimate markups  			
    Notes: */
/**********************************************************************/

* vars for estimating markups
clonevar invt = net_inv
gen lnq  = log(sales)
gen lnv  = log(cogs)
gen lnk  = log(capital)
gen lni  = log(invt)

gsort sic2 timevar
gen bv = 0
gen bk = 0

levelsof sic2, local(levels)
di "`levels'"
foreach i of local levels {
	di `i'
	count if sic2 == `i'
	capture prodest lnq  if sic2 == `i', method(op) free(lnv) proxy(lni) state(lnk) id(gvkey) t(timevar)
	replace bv = _b[lnv] if sic2 == `i'
	replace bk = _b[lnk] if sic2 == `i'
	outreg2 using "${data}/sd_Cleaned_sic2.xlsx", excel append ctitle(`i')
}
label variable bv "output elasticity with respect to cogs"

* get mark_up metric
gen alpha  = sales/cogs
gen markup = bv*alpha
bys timevar: egen S =  sum(sales)
gen m = sales/S
gen markup_w = m * markup
label variable markup_w "markup for each firm weighted by sales' share per year"

* logs, leads, lags, and deltas
gsort gvkey timevar
gen logmarkup = log(markup)
gen leadmarkup = F.markup
gen logmarkup_f1 = f.logmarkup
gen logmarkup_l1 = L.logmarkup
gen logmarkup_l2 = L2.logmarkup
gen logmarkup_l3 = L3.logmarkup
gen logmarkup_l4 = L4.logmarkup
gen pctchangemarkup = (leadmarkup-markup)/markup * 100
egen p30_logmarkupl1 = pctile(logmarkup_l1), p(30)
egen p70_logmarkupl1 = pctile(logmarkup_l1), p(70)


compress
gsort gvkey timevar
save "${data}/MarkupCyclicalityCleaned.dta", replace

/*------------------------------------ End of SECTION 3 ------------------------------------*/



